1 Benchamarking of a Nextlfow pipeline for MS-based proteomics

library(tidyverse)
library(arrow)
library(magrittr)
library(fs)
library(ggbeeswarm)
library(UpSetR)
library(arrow)
library(pheatmap)
library(viridis)
library(paletteer)
library(diann)
library(biomaRt)
library(clusterProfiler)
library(org.Hs.eg.db)


set.seed(0)
setwd("~/Documents/EMBL_UniHD/mini-benchmark")


org_cols <- c(
  Ecoli    = "#007B53",
  Hsapiens = "#193F90"
)

## Overwrites all functions
select <- dplyr::select

1.1 Computing metrics

# Auxiliary functions to convert units

## Function to convert units to MB
get_MB <- function(x) {
  ## Trim whitespace
  x %<>% as.character() %>% 
    trimws()
  
  ## Gets numeric value
  val <- str_extract(x, "[0-9]+(\\.[0-9]+)?") %>% 
    as.numeric()
  
  ## Gets unit
  unit <- str_extract(x, "[A-Za-z]+") %>% 
    toupper()
  
  ## Gets MB
  value_mb = case_when(
    unit %in% c("KB", "K")  ~ val / 1024,
    unit %in% c("MB", "M")  ~ val,
    unit %in% c("GB", "G") ~ val * 1024,
    unit %in% c("TB", "T") ~ val * 1024 * 1024,
    TRUE ~ NA_real_
  )
  
  ## Return megabytes
  return(value_mb)
}


## Function to get time information into seconds
get_seconds <- function(x) {
  x %<>% as.character()
  
  ## Gets time with regex
  h <- x %>% 
    str_extract(., "\\d+(?=h)") %>% 
    as.numeric()
  m <- x %>% 
    str_extract(., "\\d+(?=m)") %>% 
    as.numeric()
  ## Regex to allow decimal seconds
  s <- x %>% 
    str_extract(., "\\d+(?:\\.\\d+)?(?=s)") %>% 
    as.numeric()
  
  # Replace NAa with 0
  h[is.na(h)] <- 0
  m[is.na(m)] <- 0
  s[is.na(s)] <- 0
  
  ## Return seconds
  return(h * 3600 + m * 60 + s)
}

Gets metadata from server after running ls -lh */*/data/*raw | tr -s ' ' '\t' > files_list.tsv and then secure copying it to the local root.

raw_info <- read_tsv("./files_list.tsv",
                     col_names = FALSE)

head(raw_info)
## # A tibble: 6 × 9
##   X1            X2 X3    X4    X5    X6       X7 X8     X9                      
##   <chr>      <dbl> <chr> <chr> <chr> <chr> <dbl> <time> <chr>                   
## 1 -rwxrwxr-x     1 efra  efra  7.2G  Apr      11 15:23  Astral/Ecoli/data/24100…
## 2 -rwxrwxr-x     1 efra  efra  7.2G  Apr      11 15:24  Astral/Ecoli/data/24100…
## 3 -rwxrwxr-x     1 efra  efra  7.2G  Apr      11 15:26  Astral/Ecoli/data/24100…
## 4 -rwxrwxr-x     1 efra  efra  12G   Apr      21 19:16  Astral/Hsapiens/data/24…
## 5 -rwxrwxr-x     1 efra  efra  12G   Apr      21 19:18  Astral/Hsapiens/data/24…
## 6 -rwxrwxr-x     1 efra  efra  12G   Apr      21 19:20  Astral/Hsapiens/data/24…
## Keeps just the necessary information
raw_info %<>% dplyr::select(c("X5", "X9")) %>% 
  purrr::set_names(c("size", "fullname"))

## Mutates the tibble
raw_info %<>% mutate(size_MB = get_MB(size)) %>% 
  separate(
    fullname,
    into = c("instrument", "organism", "subfolder", "file_name"),
    sep = "/"
  )

head(raw_info)
## # A tibble: 6 × 6
##   size  instrument organism subfolder file_name                          size_MB
##   <chr> <chr>      <chr>    <chr>     <chr>                                <dbl>
## 1 7.2G  Astral     Ecoli    data      241002_Astral_P3539_JS_30min_2mz_…   7373.
## 2 7.2G  Astral     Ecoli    data      241002_Astral_P3539_JS_30min_2mz_…   7373.
## 3 7.2G  Astral     Ecoli    data      241002_Astral_P3539_JS_30min_2mz_…   7373.
## 4 12G   Astral     Hsapiens data      241101_Astral_P3585_JS_DIA_60min_…  12288 
## 5 12G   Astral     Hsapiens data      241101_Astral_P3585_JS_DIA_60min_…  12288 
## 6 12G   Astral     Hsapiens data      241101_Astral_P3585_JS_DIA_60min_…  12288
ggplot(raw_info, aes(x = instrument, y = size_MB / 1024, colour = organism)) +
  geom_beeswarm(size = 2, alpha = 0.7) +
  scale_x_discrete(limits = c("FusionLumos", "Exploris480", "Astral")) +
  scale_colour_manual(
    name = "Organism",
    values = org_cols
  ) +
  theme_minimal() +
  ylim(0, 12.5) +
  labs(
    x = "Mass Spectrometer",
    y = "Raw file size (GB)"
  ) +
  ggtitle("Size of raw files per experiment")

## Collapse into average size of raw files
raw_summary <- raw_info %>%
  group_by(instrument, organism) %>%
  summarise(
    avg_raw_size = mean(size_MB, na.rm = TRUE),
    ## Cleans unnecessary metadata
    .groups  = "drop"
  ) %>% 
  ## Converts back to GB
  mutate(avg_raw_size = avg_raw_size / 1024)
fasta_info <- read_tsv("./fasta_list.tsv",
                       col_names = FALSE)
## Rows: 2 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (6): X1, X3, X4, X5, X6, X9
## dbl  (2): X2, X7
## time (1): X8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fasta_info
## # A tibble: 2 × 9
##   X1            X2 X3    X4    X5    X6       X7 X8     X9                      
##   <chr>      <dbl> <chr> <chr> <chr> <chr> <dbl> <time> <chr>                   
## 1 -rw-rw-r--     1 efra  efra  2.0M  Apr      15 15:49  fastas/Ecoli_UP00000062…
## 2 -rw-rw-r--     1 efra  efra  14M   Apr      15 15:49  fastas/Hsapiens_UP00000…
## Filter out unnecessary columns
fasta_info %<>% dplyr::select(c("X5", "X9")) %>% 
  purrr::set_names(c("size", "fullname")) %>% 
  mutate(size_MB = get_MB(size)) %>% 
  ## Get organism name with regex
  mutate(organism = str_extract(fullname, "(?<=/)[^/_]+(?=_)"))

Get times after doing grep -m1 'duration: <strong>' */*/*/nf_report.html > runtimes.txt

runtimes <- read_delim("runtimes.txt",
                       delim =  "          ", ## By some reason grep created 10 whitespaces
                       col_names = FALSE) %>% 
  set_names(c("run", "string")) %>% 
  mutate(time = get_seconds(string) / 60,
         instrument = word(run, 1, sep = "/"),
         organism = word(run, 2, sep = "/"),
         profile = word(run, 3, sep = "/")) %>% 
  select(!run) %>% 
  select(!string)



runtimes %>% 
  filter(!str_detect(profile, "1\\.9\\.1")) %>% 
  ggplot() +
  geom_beeswarm(aes(y = time / 60, x = instrument, colour = organism),
                size = 2,
                alpha = 0.8,
                ) +
  scale_colour_manual(
    name = "Organism",
    values = org_cols
  ) +
  geom_boxplot(aes(y = time / 60, x = instrument),
               width = .25, alpha = 0, 
               outlier.shape = NA) +
  scale_x_discrete(limits = c("FusionLumos", "Exploris480", "Astral")) +
  theme_minimal() +
  labs(
    x =  "Mass Spectrometer",
    y = "Total runtime (h)",
    title = "Runtime per experiment"
  )

runtimes %>% 
  filter(!str_detect(profile, "1\\.9\\.1")) %>% 
  ggplot() +
  geom_beeswarm(aes(y = time / 60, x = organism, colour = instrument),
                size = 2,
                alpha = 0.9,
  ) +
  geom_boxplot(aes(y = time / 60, x = organism),
               width = .25, alpha = 0, 
               outlier.shape = NA) +
  scale_x_discrete(limits = c("Ecoli", "Hsapiens")) +
  theme_minimal() +
  labs(
    x =  "Organism",
    y = "Total runtime (h)"
  )

size_time <- left_join(runtimes, raw_summary,
                       by = c("instrument", "organism"))



size_time %>% 
  filter(!str_detect(profile, "1\\.9\\.1")) %>% 
  ggplot(aes(x = avg_raw_size, y = time /60 ,
                        colour = organism, 
                        shape = instrument)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_colour_manual(
    name = "Organism",
    values = org_cols
  ) +
  theme_minimal() +
  labs(
    x = "Average raw file size (GB)",
    y = "Total runtime (h)",
    title = "Runtime vs file size"
  )

size_time %>% 
  filter(!str_detect(profile, "1\\.9\\.1")) %>% 
  ggplot(aes(x = avg_raw_size, y = time /60 ,
                        colour = instrument,
                        shape = organism)) +
  geom_point(size = 2, alpha = 0.8) +
  theme_minimal() +
  labs(
    x = "Average raw file size (GB)",
    y = "Total runtime (h)",
    title = "Runtime vs file size"
  )

size_time %>%
  filter(!str_detect(profile, "1\\.9\\.1")) %>% 
  filter(instrument == "Astral") %>% 
  ggplot(aes(x = avg_raw_size, y = time /60 ,
             colour = organism)) +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(
    name = "Organism",
    values = org_cols
  ) +
  theme_minimal() +
  labs(
    x = "Average raw file size (GB)",
    y = "Total runtime (h)" 
  ) +
  ggtitle("Astral runs")

size_time %>%
  filter(!str_detect(profile, "1\\.9\\.1")) %>% 
  filter(instrument == "Exploris480") %>% 
  ggplot(aes(x = avg_raw_size, y = time /60 ,
             colour = organism)) +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(
    name = "Organism",
    values = org_cols
  ) +
  theme_minimal() +
  labs(
    x = "Average raw file size (GB)",
    y = "Total runtime (h)" 
  ) +
  ggtitle("Exploris480 runs")

size_time %>%
  filter(!str_detect(profile, "1\\.9\\.1")) %>% 
  filter(instrument == "FusionLumos") %>% 
  ggplot(aes(x = avg_raw_size, y = time /60 ,
             colour = organism)) +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(
    name = "Organism",
    values = org_cols
  ) +
  theme_minimal() +
  labs(
    x = "Average raw file size (GB)",
    y = "Total runtime (h)" 
  ) +
  ggtitle("FusionLumos runs")

deconv_rt <- function(x, ms, org) {
  x %>% 
    filter(
      !str_detect(profile, "1\\.9\\.1"),
      instrument == ms,
      organism == org
    ) %>% 
    mutate(
      convert_tool = word(profile, 1, sep = "_"),
      diann_version = word(profile, 2, sep = "_"),
    ) %>% 
    mutate(
      diann_version = replace_na(diann_version, "diannv2.1")
    ) %>% 
    ggplot(aes(x = convert_tool, y = time / 60, shape = diann_version)) +
    geom_beeswarm(size = 3, color = "#18974C") +
    theme_minimal() +
    labs(
      x = "Raw conversion tool",
      y = "Total runtime (h)"
    ) +
    ggtitle(paste("Pipeline runtimes for", org, "measured in", ms))
}


deconv_rt(size_time, "Astral", "Hsapiens") +
  ylim(0, NA)

deconv_rt(size_time, "Astral", "Ecoli")  +
  ylim(0, NA)

## Gets names of the traces TSVs
traces_files <- dir_ls(
  path = ".",
  recurse = TRUE,
  glob = "*nf_trace.tsv"
)

## Reads all the trace files
traces_list <- traces_files %>% 
  purrr::set_names() %>% 
  map(read_tsv)
## Using map because it is a list of tibbles
traces_list %<>% map(
  ~ mutate(
    .x,
    
    ### Convert time metrics to seconds
    duration_seconds = get_seconds(duration),
    realtime_seconds = get_seconds(realtime),
    
    ### Convert memory metrics to MB
    peak_rss_MB = get_MB(peak_rss),
    peak_vmem_MB = get_MB(peak_vmem),
    rchar_MB = get_MB(rchar),
    wchar_MB = get_MB(wchar),
    
    ### Get CPU percentage
    cpu_percentage = .x %>% 
      pluck("%cpu") %>% 
      str_remove("%") %>% 
      as.numeric()
  )
)
  

## Adds metadata from the tibble name as column
## Use imap() instead if map2() for simplicity
traces_list %<>% imap(~ .x %>% 
                        mutate(
                          instrument = word(.y, 1, sep = "/"),
                          organism = word(.y, 2, sep = "/"),
                          profile = word(.y, 3, sep = "/")
                        ))


traces_list[1] %>% str()
## List of 1
##  $ Astral/Ecoli/diannv2.1/nf_trace.tsv: tibble [4 × 24] (S3: tbl_df/tbl/data.frame)
##   ..$ task_id         : num [1:4] 1 2 3 4
##   ..$ hash            : chr [1:4] "fe/803e00" "48/d239c6" "a6/987de5" "ee/aff423"
##   ..$ native_id       : num [1:4] 2913659 2915213 2924176 2924182
##   ..$ name            : chr [1:4] "diann_2_1_workflow:fasta_to_speclib" "diann_2_1_workflow:diann_search" "diann_2_1_workflow:copy_files" "diann_2_1_workflow:diann_stats (1)"
##   ..$ status          : chr [1:4] "COMPLETED" "COMPLETED" "COMPLETED" "COMPLETED"
##   ..$ exit            : num [1:4] 0 0 0 0
##   ..$ submit          : POSIXct[1:4], format: "2025-04-20 19:27:11" "2025-04-20 19:28:00" ...
##   ..$ duration        : chr [1:4] "48.8s" "8m 41s" "140ms" "9.1s"
##   ..$ realtime        : chr [1:4] "48.5s" "8m 40s" "28ms" "8.7s"
##   ..$ %cpu            : chr [1:4] "2366.8%" "1409.0%" "104.9%" "327.3%"
##   ..$ peak_rss        : chr [1:4] "5.7 GB" "18.4 GB" "0" "614.8 MB"
##   ..$ peak_vmem       : chr [1:4] "9.9 GB" "1.5 TB" "0" "7.9 GB"
##   ..$ rchar           : chr [1:4] "177.6 MB" "421.2 MB" "25.4 MB" "155.6 MB"
##   ..$ wchar           : chr [1:4] "175.2 MB" "70 MB" "25.3 MB" "421.2 KB"
##   ..$ duration_seconds: num [1:4] 48.8 521 8400 9.1
##   ..$ realtime_seconds: num [1:4] 48.5 520 1680 8.7
##   ..$ peak_rss_MB     : num [1:4] 5837 18842 NA 615
##   ..$ peak_vmem_MB    : num [1:4] 10138 1572864 NA 8090
##   ..$ rchar_MB        : num [1:4] 177.6 421.2 25.4 155.6
##   ..$ wchar_MB        : num [1:4] 175.2 70 25.3 0.411
##   ..$ cpu_percentage  : num [1:4] 2367 1409 105 327
##   ..$ instrument      : chr [1:4] "Astral" "Astral" "Astral" "Astral"
##   ..$ organism        : chr [1:4] "Ecoli" "Ecoli" "Ecoli" "Ecoli"
##   ..$ profile         : chr [1:4] "diannv2.1" "diannv2.1" "diannv2.1" "diannv2.1"

summarize everything

## Summarize total times per <instrument/organism/profile>
traces_times <- imap_dfr(traces_list, ~ {
  # split the list‐name (.y) into components
  comb <- str_split(.y, "/", simplify = TRUE)
  tibble(
    instrument = comb[1],
    organism = comb[2],
    profile = comb[3],
    ## Also convert to minutes
    total_duration_min = sum(.x$duration_seconds, na.rm = TRUE) %>% { . / 60 },
    total_realtime_min = sum(.x$realtime_seconds, na.rm = TRUE) %>% { . / 60 }
  )
})

1.2 Biological metrics

1.2.1 Quality Control

## Standard QC thresholds on DIA-NN reports
## Extracts high quality peptides
qc <- function(x) {
  x %>% 
    collect() %>% 
    filter(
      ## Standard thresholds
      Q.Value <= 0.01,
      Lib.Q.Value <= 0.01,
      Lib.PG.Q.Value <= 0.01,
      PG.Q.Value <= 0.05,
      ### Filters out contaminants
      !str_detect(Protein.Group, "contam_sp"),
      !str_detect(Protein.Ids, "contam_sp")
    ) %>% 
    pull(Precursor.Id) %>%
    unique()
}


## Function to perform qc on the report's precursor
## and filter them out from the matrices
clean <- function(report, mat) {
  out <- mat %>% 
    filter(Precursor.Id %in% qc(report)) %>%
    ## Collapses to precursor level and removes missing values
    mutate(
      ## Converts NAs to 0s
      across(c(ends_with("mzML"), ends_with("raw")),
             ~ replace_na(.x, 0))
    ) %>% 
    group_by(Precursor.Id) %>% 
    mutate(
      ## Sum intensities per precursors
      across(c(ends_with("mzML"), ends_with("raw")),
             sum),
    ) %>% 
    ungroup() %>% 
    filter(
      ## Filters our zeros
      !if_any(c(ends_with("mzML"), ends_with("raw")),
              ~. == 0)
    )
}
## Lists parquets
parquet_list <- dir_ls(
  path = ".",
  recurse = TRUE,
  glob = "*report.parquet"
) %>% 
  ## Reads all the parquet files
  purrr::set_names() %>% 
  map(open_dataset, format = "parquet")


## Lists DIA-NN v1.8 reports
diann_list <- dir_ls(
  path = ".",
  recurse = TRUE,
  glob = "*diann-report"
) %>% 
  ## Read DIA-NN v1.8 reports
  purrr::set_names() %>% 
  map(diann_load)


## Concatenates all
all_reports <- c(
  parquet_list,
  diann_list
)


## Lists matrices
pr_matrixes <- dir_ls(
  path = ".",
  recurse = TRUE,
  regexp = "*pr_matrix.tsv"
) %>% 
  ## Exclude firts passed
  str_subset("first-pass", negate = TRUE) %>% 
  purrr::set_names() %>% 
  map(read_tsv)
# Performs QC

## Gets <instrument/organism/profile> keys
keys <- all_reports %>% 
  names() %>% 
  str_replace("^((?:[^/]+/){2}[^/]+).*", "\\1")


keyed_reports <- all_reports %>% 
  set_names(
    sub("^((?:[^/]+/){2}[^/]+).*", "\\1", names(.) )
  )


keyed_matrices <- pr_matrixes %>% 
  set_names(
    sub("^((?:[^/]+/){2}[^/]+).*", "\\1", names(.) )
  )

pr_matrixes_cleaned <- imap(
  keyed_reports,
  ~ if(.y %in% names(keyed_matrices)) 
    clean(.x, keyed_matrices[[.y]])
)

1.2.2 Detected biological entities

# Functions to extract information from DIA-NN outputs

## Get protein groups from parquet DIA-NN output
pg <- function(x) {
  x %>% 
    collect() %>% 
    pull(Protein.Group) %>% 
    unique()
}

## Get proteins from parquet DIA-NN output
proteins <- function(x) {
  x %>% 
    collect() %>% 
    pull(Protein.Ids) %>% 
    unique()
}

## Get peptides from parquet DIA-NN output
peptides <- function(x) {
  x %>% 
    collect() %>% 
    pull(Precursor.Id) %>% 
    unique()
}
pg_list <- c(
  pr_matrixes_cleaned %>% map(pg)
  )


proteins_list <- c(
  pr_matrixes_cleaned %>% map(proteins)
)


peptides_list <- c(
  pr_matrixes_cleaned %>% map(peptides)
)
# Auxiliary functions to get Jaccard indexes 

## Computes Jaccard Index
jaccard <- function(x, y) {
  length(intersect(x, y)) / length(union(x, y))
}


## Commputes jaccard index pairwise of list
jpw <- function(x) {
  n <- names(x)
  ## Expands pairwise
  expand_grid(
    names1 = n,
    names2 = n,
  ) %>% 
    ## Maps the function fer columns
    mutate(
      index = map2_dbl(x[names1], x[names2], jaccard)
    ) %>% 
    ## Comes back to squared datafra,e
    pivot_wider(
      names_from = names2,
      values_from = index
    ) %>%
    column_to_rownames("names1") 
}

## Function to automatize heatmap plotting
jaccard_hm <- function(x, string) {
  x %>% 
    jpw() %>% 
      pheatmap(
        main  = string,
        cluster_rows = TRUE,
        cluster_cols = TRUE,
        border_color = NA,
        color = viridis(100),
        angle_col = 45,
        fontsize_row = 10,
        fontsize_col = 10,
        cellwidth  = 30,
        cellheight = 30
      )
}


## Function to automatize upset plotting
jaccard_us <- function(x) {
  x %>% 
    fromList() %>% 
    upset(nsets = ncol(.),
          order.by = "freq",
          decreasing = TRUE,
          nintersects = 10)
}

1.2.2.1 Protein groups

1.2.2.1.1 Homo sapiens
pg_hsapiens <- pg_list %>% 
  { .[str_detect(names(.), "Hsapiens")] }


pg_hsapiens_astral <- pg_hsapiens %>% 
  { .[str_detect(names(.), "Astral")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(pg_hsapiens_astral, "Astral H sapiens | Pairwise Jaccard Index of identified protein groups")

jaccard_us(pg_hsapiens_astral)

pg_hsapiens_exploris <- pg_hsapiens %>% 
  { .[str_detect(names(.), "Exploris480")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(pg_hsapiens_exploris, "Exploris480 H sapiens | Pairwise Jaccard Index")

jaccard_us(pg_hsapiens_exploris)

pg_hsapiens_lumos <- pg_hsapiens %>% 
  { .[str_detect(names(.), "FusionLumos")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(pg_hsapiens_lumos, "FusionLumos H sapiens | Pairwise Jaccard Index")

jaccard_us(pg_hsapiens_lumos)

1.2.2.1.2 Escherichia coli
pg_ecoli <- pg_list %>% 
  { .[str_detect(names(.), "Ecoli")] }


pg_ecoli_astral <- pg_ecoli %>% 
  { .[str_detect(names(.), "Astral")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(pg_ecoli_astral, "Astral E coli | Pairwise Jaccard Index")

jaccard_us(pg_ecoli_astral)

pg_ecoli_exploris <- pg_ecoli %>% 
  { .[str_detect(names(.), "Exploris480")] } %>% 
  set_names( word(names(.), 3, sep = "/") )

jaccard_hm(pg_ecoli_exploris, "Exploris480 E coli | Pairwise Jaccard Index")

jaccard_us(pg_ecoli_exploris)

pg_ecoli_lumos <- pg_ecoli %>% 
  { .[str_detect(names(.), "FusionLumos")] } %>% 
   set_names( word(names(.), 3, sep = "/") )

jaccard_hm(pg_ecoli_lumos, "FusionLumos E coli | Pairwise Jaccard Index")

jaccard_us(pg_ecoli_lumos)

1.2.2.2 Proteins

1.2.2.2.1 Homo sapiens
prots_hsapiens <- proteins_list %>% 
  { .[str_detect(names(.), "Hsapiens")] }


prots_hsapiens_astral <- prots_hsapiens %>% 
  { .[str_detect(names(.), "Astral")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(prots_hsapiens_astral, "Astral H sapiens | Pairwise Jaccard Index of identified proteins")

jaccard_us(prots_hsapiens_astral)

prots_hsapiens_exploris <- prots_hsapiens %>% 
  { .[str_detect(names(.), "Exploris480")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(prots_hsapiens_exploris, "Exploris480 H sapiens | Pairwise Jaccard Index")

jaccard_us(prots_hsapiens_exploris)

prots_hsapiens_lumos <- prots_hsapiens %>% 
  { .[str_detect(names(.), "FusionLumos")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(prots_hsapiens_lumos, "FusionLumos H sapiens | Pairwise Jaccard Index")

jaccard_us(prots_hsapiens_lumos)

1.2.2.2.2 Escherichia coli
prots_ecoli <- pg_list %>% 
  { .[str_detect(names(.), "Ecoli")] }


prots_ecoli_astral <- prots_ecoli %>% 
  { .[str_detect(names(.), "Astral")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(prots_ecoli_astral, "Astral E coli | Pairwise Jaccard Index")

jaccard_us(prots_ecoli_astral)

prots_ecoli_exploris <- prots_ecoli %>% 
  { .[str_detect(names(.), "Exploris480")] } %>% 
  set_names( word(names(.), 3, sep = "/") )

jaccard_hm(prots_ecoli_exploris, "Exploris480 E coli | Pairwise Jaccard Index")

jaccard_us(prots_ecoli_exploris)

prots_ecoli_lumos <- prots_ecoli %>% 
  { .[str_detect(names(.), "FusionLumos")] } %>% 
   set_names( word(names(.), 3, sep = "/") )

jaccard_hm(prots_ecoli_lumos, "FusionLumos E coli | Pairwise Jaccard Index")

jaccard_us(prots_ecoli_lumos)

1.2.2.3 Precursors

1.2.2.3.1 Homo sapiens
pepts_hsapiens <- peptides_list %>% 
  { .[str_detect(names(.), "Hsapiens")] }


pepts_hsapiens_astral <- pepts_hsapiens %>% 
  { .[str_detect(names(.), "Astral")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(pepts_hsapiens_astral, "Astral H sapiens | Pairwise Jaccard Index of identified precursors")

jaccard_us(pepts_hsapiens_astral)

pepts_hsapiens_exploris <- pepts_hsapiens %>% 
  { .[str_detect(names(.), "Exploris480")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(pepts_hsapiens_exploris, "Exploris480 H sapiens | Pairwise Jaccard Index")

jaccard_us(pepts_hsapiens_exploris)

pepts_hsapiens_lumos <- pepts_hsapiens %>% 
  { .[str_detect(names(.), "FusionLumos")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(pepts_hsapiens_lumos, "FusionLumos H sapiens | Pairwise Jaccard Index")

jaccard_us(pepts_hsapiens_lumos)

1.2.2.3.2 Escherichia coli
pepts_ecoli <- peptides_list %>% 
  { .[str_detect(names(.), "Ecoli")] }


pepts_ecoli_astral <- pepts_ecoli %>% 
  { .[str_detect(names(.), "Astral")] } %>% 
  set_names( word(names(.), 3, sep = "/") )


jaccard_hm(pepts_ecoli_astral, "Astral E coli | Pairwise Jaccard Index")

jaccard_us(pepts_ecoli_astral)

pepts_ecoli_exploris <- pepts_ecoli %>% 
  { .[str_detect(names(.), "Exploris480")] } %>% 
  set_names( word(names(.), 3, sep = "/") )

jaccard_hm(pepts_ecoli_exploris, "Exploris480 E coli | Pairwise Jaccard Index")

jaccard_us(pepts_ecoli_exploris)

pepts_ecoli_lumos <- pepts_ecoli %>% 
  { .[str_detect(names(.), "FusionLumos")] } %>% 
   set_names( word(names(.), 3, sep = "/") )

jaccard_hm(pepts_ecoli_lumos, "FusionLumos E coli | Pairwise Jaccard Index")

jaccard_us(pepts_ecoli_lumos)

1.2.3 Coefficient of variation

## Function to compute cross validations
compute_cv <- function(x) {
  x %>% 
    rowwise() %>% 
    mutate(
      cv = {
        c_across(c(ends_with("mzML"), ends_with("raw"))) %>% 
          log() %>% 
          { sd(.)/mean(.) }
      }
    ) %>% 
    ungroup()
}

## 


#pr_matrixes_cleaned %<>% lapply(compute_cv)
## Gets the common peptides detected across all profiles
common_peptds <- purrr::reduce(pepts_hsapiens_astral, intersect)

## Gets data just from Astral and Human
prs_hsapiens_astral <- pr_matrixes_cleaned %>% 
  { .[str_detect(names(.), "Hsapiens")] } %>% 
  { .[str_detect(names(.), "Astral")] } %>% 
  ## Just common peptides
  lapply(filter, Precursor.Id %in% common_peptds) %>% 
  ## Compute cvs
  lapply(compute_cv)
  ## Compute mean
  #lapply(compute_means)



## Extracts cvs
cvs <- prs_hsapiens_astral %>% 
  ## Creates tibble with sources and all cv's
  imap_dfr(
    ~ tibble(
      profile = .y,
      cv     = .x$cv
    )
  ) %>% 
  mutate(
    ## Deletes unnecessary info
    profile = word(profile, 3, sep = "/")
  )


## Violin and boxplot
ggplot(cvs, aes(x = profile, y = cv)) +
  geom_violin(color = "gray",
              fill = "#6CC24A",
              alpha = 0.5) +
  geom_boxplot(fill = "#18974C",
               width = .25,
               outlier.shape = NA) +
  theme_minimal() +
  scale_y_sqrt() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45,  hjust = 1, vjust = 1)) +
  labs(
    x = "Workflow",
    y = "Coefficient of Variation",
    title = "CV's across different workflows | Astral, H sapiens"
  )

## Just boxplot
ggplot(cvs, aes(x = profile, y = cv)) +
  geom_boxplot(fill = "#18974C",
               width = .5,
               outlier.shape = NA) +
  ylim(0, 0.035) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
  labs(
    x = "Workflow",
    y = "Coefficient of Variation",
    title = "CV's across different workflows | Astral, H sapiens"
  ) 
## Warning: Removed 35422 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

1.2.4 Intensities of unidentified precursors

### All with peptides identified with newer DIA-NN versions
list_pepts_no_diann1_8 <- pepts_hsapiens_astral[c(
  "diannv2.1",
  "msconvert_diannv1.9.2",
  "msconvert_diannv2.0",
  "msconvert_diannv2.0.1",
  "msconvert_diannv2.0.2",
  "thermoraw_diannv1.9.2",
  "thermoraw_diannv2.0",
  "thermoraw_diannv2.0.1",
  "thermoraw_diannv2.0.2"
)]

## Intersection off all sets
pepts_no_diann1_8 <- Reduce(intersect, list_pepts_no_diann1_8)

## Intersection of set of dia-nn 1.8
pepts_diann1_8 <- c(
  pepts_hsapiens_astral$msconvert_diannv1.8.1,
  pepts_hsapiens_astral$thermoraw_diannv1.8.1
)

## Proteins identified by all versions but DIA-NN 1.8
pepts_unid <- setdiff(pepts_no_diann1_8, pepts_diann1_8)


## Flags if the precursor was identified by DIA-NN 1.8 or not in the precrusos matrix
all_prs_mat <- pr_matrixes_cleaned %>% 
  { .[str_detect(names(.), "Hsapiens")] } %>% 
  { .[str_detect(names(.), "Astral")] } %>% 
  map(
  ~.x %>% 
    mutate(is_unid = Precursor.Id %in% pepts_unid)
)

## Pivots everythin into a single object
all_long <- map_dfr(
  all_prs_mat,
  ~ .x %>%
    pivot_longer(
      cols = c(ends_with("mzML"), ends_with("raw")),
      names_to = "sample",
      values_to = "intensity"
    ),
  .id = "tibble_id"
)


## Plots all densities of all samples together
ggplot(all_long, aes(x = log(intensity),
                     color = !(is_unid),
                     group = interaction(sample, !(is_unid)))) +
  geom_density(alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Intensity densities of all Astral H. sapiens runs",
    x = "log(Intensity)",
    y = "Density",
    color = "Identified by DIA-NN 1.8"
  )

1.2.5 GO enrichment of unidentifed proteins

### All with proteins identified with newer DIA-NN versions
list_prots_no_diann1_8 <- prots_hsapiens_astral[c(
  "diannv2.1",
  "msconvert_diannv1.9.2",
  "msconvert_diannv2.0",
  "msconvert_diannv2.0.1",
  "msconvert_diannv2.0.2",
  "thermoraw_diannv1.9.2",
  "thermoraw_diannv2.0",
  "thermoraw_diannv2.0.1",
  "thermoraw_diannv2.0.2"
)]

#4 Intersection off all sets
prots_no_diann1_8 <- Reduce(intersect, list_prots_no_diann1_8)

## Union of set of dia-nn 1.8
prots_diann1_8 <- c(
  prots_hsapiens_astral$msconvert_diannv1.8.1,
  prots_hsapiens_astral$thermoraw_diannv1.8.1
)

## Proteins identified by all versions but DIA-NN 1.8
prots_unid <- setdiff(prots_no_diann1_8, prots_diann1_8)

## All proteins identified with all versions
all_prots <- unlist(prots_hsapiens_astral) %>% 
  unique()


## Mart
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")


## Convert ID's
all_prots_ids <- getBM(
  attributes = c("uniprotswissprot", "entrezgene_id"),
  filters = "uniprotswissprot",
  values = all_prots,
  mart = mart
) %>% 
  pull("entrezgene_id") %>% 
  as.character()

interest_ids <- getBM(
  attributes = c("uniprotswissprot", "entrezgene_id"),
  filters = "uniprotswissprot",
  values = prots_unid,
  mart = mart
) %>% 
  pull("entrezgene_id") %>%
  as.character()


## GO Enrichment Analysis

### Biological Process
go_bp <- enrichGO(
  gene = interest_ids,
  universe = all_prots_ids,
  OrgDb = org.Hs.eg.db,
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.05,
  readable = TRUE
)

### Molecular Function
go_mf <- enrichGO(
  gene = interest_ids,
  universe = all_prots_ids,
  OrgDb = org.Hs.eg.db,
  ont = "MF",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.05,
  readable = TRUE
)

### Cellular Component
go_cc <- enrichGO(
  gene = interest_ids,
  universe = all_prots_ids,
  OrgDb = org.Hs.eg.db,
  ont = "CC",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.05,
  readable = TRUE
)



## All ontologies
go_all <- enrichGO(
  gene = interest_ids,
  universe = all_prots_ids,
  OrgDb = org.Hs.eg.db,
  ont = "ALL",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.05,
  readable = TRUE
)


## Plots
barplot(go_mf)

dotplot(go_mf)

2 Session Info

sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Mexico_City
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.18.0    AnnotationDbi_1.64.1   IRanges_2.36.0        
##  [4] S4Vectors_0.40.2       Biobase_2.62.0         BiocGenerics_0.48.1   
##  [7] clusterProfiler_4.10.1 biomaRt_2.58.2         diann_1.0.1           
## [10] paletteer_1.6.0        viridis_0.6.5          viridisLite_0.4.2     
## [13] pheatmap_1.0.12        UpSetR_1.4.0           ggbeeswarm_0.7.2      
## [16] fs_1.6.6               magrittr_2.0.3         arrow_19.0.1          
## [19] lubridate_1.9.4        forcats_1.0.0          stringr_1.5.1         
## [22] dplyr_1.1.4            purrr_1.0.2            readr_2.1.5           
## [25] tidyr_1.3.1            tibble_3.2.1           ggplot2_3.5.1         
## [28] tidyverse_2.0.0       
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.17.1       jsonlite_1.8.9         
##   [4] farver_2.1.2            rmarkdown_2.28          zlibbioc_1.48.2        
##   [7] vctrs_0.6.5             memoise_2.0.1           RCurl_1.98-1.16        
##  [10] ggtree_3.10.1           htmltools_0.5.8.1       progress_1.2.3         
##  [13] curl_5.2.3              gridGraphics_0.5-1      sass_0.4.9             
##  [16] bslib_0.8.0             plyr_1.8.9              cachem_1.1.0           
##  [19] igraph_2.1.1            lifecycle_1.0.4         pkgconfig_2.0.3        
##  [22] gson_0.1.0              Matrix_1.6-5            R6_2.5.1               
##  [25] fastmap_1.2.0           GenomeInfoDbData_1.2.11 aplot_0.2.3            
##  [28] digest_0.6.37           enrichplot_1.22.0       colorspace_2.1-1       
##  [31] rematch2_2.1.2          patchwork_1.3.0         RSQLite_2.3.9          
##  [34] labeling_0.4.3          filelock_1.0.3          fansi_1.0.6            
##  [37] timechange_0.3.0        httr_1.4.7              polyclip_1.10-7        
##  [40] compiler_4.3.3          bit64_4.5.2             withr_3.0.2            
##  [43] BiocParallel_1.36.0     DBI_1.2.3               highr_0.11             
##  [46] ggforce_0.4.2           MASS_7.3-60.0.1         rappdirs_0.3.3         
##  [49] HDO.db_0.99.1           tools_4.3.3             vipor_0.4.7            
##  [52] scatterpie_0.2.4        ape_5.8                 beeswarm_0.4.0         
##  [55] glue_1.8.0              nlme_3.1-166            GOSemSim_2.28.1        
##  [58] shadowtext_0.1.4        grid_4.3.3              reshape2_1.4.4         
##  [61] fgsea_1.28.0            generics_0.1.3          gtable_0.3.5           
##  [64] tzdb_0.4.0              data.table_1.16.2       hms_1.1.3              
##  [67] tidygraph_1.3.1         xml2_1.3.6              utf8_1.2.4             
##  [70] XVector_0.42.0          ggrepel_0.9.6           pillar_1.9.0           
##  [73] vroom_1.6.5             yulab.utils_0.1.7       splines_4.3.3          
##  [76] tweenr_2.0.3            treeio_1.26.0           BiocFileCache_2.10.2   
##  [79] lattice_0.22-6          bit_4.5.0               tidyselect_1.2.1       
##  [82] GO.db_3.18.0            Biostrings_2.70.3       knitr_1.48             
##  [85] gridExtra_2.3           xfun_0.48               graphlayouts_1.2.0     
##  [88] stringi_1.8.4           lazyeval_0.2.2          ggfun_0.1.6            
##  [91] yaml_2.3.10             evaluate_1.0.1          codetools_0.2-20       
##  [94] RcppEigen_0.3.4.0.2     ggraph_2.2.1            qvalue_2.34.0          
##  [97] ggplotify_0.1.2         cli_3.6.3               munsell_0.5.1          
## [100] jquerylib_0.1.4         Rcpp_1.0.13             GenomeInfoDb_1.38.8    
## [103] dbplyr_2.5.0            png_0.1-8               XML_3.99-0.17          
## [106] parallel_4.3.3          assertthat_0.2.1        blob_1.2.4             
## [109] prettyunits_1.2.0       DOSE_3.28.2             bitops_1.0-9           
## [112] tidytree_0.4.6          scales_1.3.0            crayon_1.5.3           
## [115] rlang_1.1.4             cowplot_1.1.3           fastmatch_1.1-4        
## [118] KEGGREST_1.42.0